################################################################################
#        CAS Data Engineering Modul 2 - "Crypto Gruppe"                        #
#        Time Series Analysis of Bitcoin Prices with R                         #
#         -> Statistical Tests and ARIMA Experiment <-                         #
#                                                                              #
#                         09 July 2021                                         #
#                                                                              #
################################################################################
# References: 
#
# SUMNER, T. 14/11/2019. Forecasting Bitcoin in R. 
# Available from: https://rstudio-pubs-static.s3.amazonaws.com/549884_39fa223876e448608b7a7fa79337feba.html
#
# JAQUART, P. DANN, D. WEINHARDT, Ch. Short-term bitcoin market prediction via machine learning
# Available online at www.sciencedirect.com
#
# URAS, N. MARCHESI, L. MARCHESI, M. TONELLI, R. Forecasting Bitcoin closing price series using linear regression and neural networks models
# Uras et al. (2020), PeerJ Comput. Sci., DOI 10.7717/peerj-cs.279
#
#
################################################################################


# Remove Data & Libraries
rm(list=ls())
# Load Libraries
library(foreign)
library(psych)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::%+%()   masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v forecast  8.15     v expsmooth 2.3 
## v fma       2.4
## -- Conflicts ------------------------------------------------- fpp2_conflicts --
## x ggplot2::%+%()   masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
library(astsa)
## 
## Attaching package: 'astsa'
## The following objects are masked from 'package:fma':
## 
##     chicken, sales
## The following object is masked from 'package:forecast':
## 
##     gas
## The following object is masked from 'package:fpp2':
## 
##     oil
## The following object is masked from 'package:psych':
## 
##     scatter.hist
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tseries)
# Initial configurations
# Set default font
windowsFonts(Georgia = windowsFont("Georgia"))


## Set Original plot theme
my_theme = theme(panel.grid = element_line(color = '#e6e6e6'),
                 panel.background = element_rect(fill = 'white'),
                 plot.title = element_text(hjust = .5, size = 28, 
                                           colour = '#ffa500'),
                 text = element_text(family = 'Georgia'),
                 axis.text = element_text(size = 10),
                 axis.title = element_text(size = 18, family = 'Georgia', 
                                           face = 'bold'),
                 axis.line = element_line(colour = '#737373', size = 1),
                 strip.background = element_rect(colour = "black", 
                                                 fill = "white"),
                 strip.text = element_text(face = 'bold'))  
# Read Data
# Read Original Bitcoin Dataset
bitcoin <- read_csv('bitcoin_full_daily_prices.csv')
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   sentiment_elon = col_character(),
##   sentiment_bitcoin = col_character()
## )
## i Use `spec()` for the full column specifications.
names(bitcoin)
##  [1] "Date"                   "Open"                   "High"                  
##  [4] "Low"                    "Close"                  "WeightedPrice"         
##  [7] "Volume"                 "SMA_30"                 "EMA_40"                
## [10] "Altcoin_EMA_40"         "DASH"                   "DOGE"                  
## [13] "ETC"                    "ETH"                    "LTC"                   
## [16] "SC"                     "XEM"                    "XMR"                   
## [19] "XRP"                    "ZEC"                    "CLF"                   
## [22] "CNYUSDX"                "DJI"                    "EURUSDX"               
## [25] "GCF"                    "GSPC"                   "IXIC"                  
## [28] "JPYUSDX"                "TSLA"                   "VIX"                   
## [31] "XWDTO"                  "Cost_per_TR"            "Num_TR_per_Block"      
## [34] "Bu_Be_Spread_MA8"       "SMA_05"                 "SMA_90"                
## [37] "EMA_05"                 "EMA_90"                 "MACD"                  
## [40] "Avg_Dir_Mvmt"           "RSI"                    "Awesome_Osc"           
## [43] "ROC"                    "Stoch_RSI"              "Ultimate_Osc"          
## [46] "True_SI"                "Cum_Return"             "Log_Return"            
## [49] "Number_of_Transactions" "Active_Addresses"       "New_Addresses"         
## [52] "Hash_Rate"              "sentiment_elon"         "sentiment_bitcoin"
# Read United BTC Dataset (used in the Python Modelling)
btc <- read_csv('bitcoin_full_daily_returns.csv')
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   sentiment_elon = col_character(),
##   sentiment_bitcoin = col_character()
## )
## i Use `spec()` for the full column specifications.
names(btc)
##  [1] "Date"                   "WeightedPrice_return"   "DASH_return"           
##  [4] "DOGE_return"            "ETC_return"             "ETH_return"            
##  [7] "LTC_return"             "SC_return"              "XEM_return"            
## [10] "XMR_return"             "XRP_return"             "ZEC_return"            
## [13] "CLF_return"             "CNYUSDX_return"         "DJI_return"            
## [16] "EURUSDX_return"         "GCF_return"             "GSPC_return"           
## [19] "IXIC_return"            "JPYUSDX_return"         "TSLA_return"           
## [22] "VIX_return"             "XWDTO_return"           "Volume"                
## [25] "SMA_30"                 "EMA_40"                 "Altcoin_EMA_40"        
## [28] "Cost_per_TR"            "Num_TR_per_Block"       "Bu_Be_Spread_MA8"      
## [31] "SMA_05"                 "SMA_90"                 "EMA_05"                
## [34] "EMA_90"                 "MACD"                   "Avg_Dir_Mvmt"          
## [37] "RSI"                    "Awesome_Osc"            "ROC"                   
## [40] "Stoch_RSI"              "Ultimate_Osc"           "True_SI"               
## [43] "Cum_Return"             "Log_Return"             "Number_of_Transactions"
## [46] "Active_Addresses"       "New_Addresses"          "Hash_Rate"             
## [49] "sentiment_elon"         "sentiment_bitcoin"
# Create Time series Dataset
# Create original time series Dataset
bit_ts = bitcoin %>%
  filter(Date > as.Date('2017-01-01')) %>%
  arrange(Date) %>%
  select(WeightedPrice) %>%
  as.matrix() %>%
  ts()


# Create time series from BTC returns Dataset
bit_ret_ts = btc %>%
  filter(Date > as.Date('2017-01-01')) %>%
  arrange(Date) %>%
  select(WeightedPrice_return) %>%
  as.matrix() %>%
  ts()
# Plot BTC Prices (full Dataset)
# Plot BTC Data
ggplotly(ggplot(bitcoin, aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           labs(title = 'Bitcoin Weighted Prices 2014 -2021', x = '') +
           scale_y_continuous(breaks = c(0, 5000, 10000, 15000, 30000, 60000), 
                              labels = c('$0', '$5,000', '$10,000', '$15,000', 
                                         '$30,000', '$60,000')) + my_theme)
# Plot BTC Prices after 2017
ggplotly(bitcoin %>%
           filter(Date > as.Date('2017-01-01')) %>% ggplot(aes(Date, 
                                                               WeightedPrice)) + 
           geom_line(col = '#ffa500') + 
           labs(title = 'Bitcoin Weighted Prices after 2017', x = '') +
           scale_y_continuous(breaks = c(0, 5000, 10000, 15000, 30000, 60000), 
                              labels = c('$0', '$5,000', '$10,000', '$15,000', 
                                         '$30,000', '$60,000')) + my_theme)
# Correlation plots for BTC Prices & its lags
gglagplot(bit_ts, do.lines = F) + my_theme +
  scale_color_continuous(low = "#b37400", high = "#ffc04d", 
                         breaks = c(1, 366, 731, 1097, 1463), 
                         labels = c('2017', '2018', '2019', '2020', '2021')) + 
  scale_y_continuous(breaks = c(0, 5000, 10000, 15000, 30000, 60000), 
                     labels = c('$0', '$5,000', '$10,000', '$15,000', 
                                '$30,000', '$60,000')) +
  scale_x_continuous(breaks = c(5000, 10000, 15000, 30000, 60000), 
                     labels = c('$5,000', '$10,000', '$15,000', 
                                '$30,000', '$60,000'))

# Autocorrelation (ACF) and Partial Autocorrelation (PACF) plots
ggAcf(bit_ts, lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')

ggPacf(bit_ts, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Autocorrelation (ACF) and Partial Autocorrelation (PACF) after differencing
ggAcf(diff(bit_ts), lag.max = 200) + my_theme + 
  labs(title = 'ACF with First Differnce' , y = 'Correlation') 

ggPacf(diff(bit_ts), lag.max = 200) + my_theme + 
  labs(title = 'PACF with First Difference', y = '')

# Autocorrelation (ACF) and Partial Autocorrelation (PACF) with daily returns 
ggAcf(bit_ret_ts, lag.max = 200) + my_theme + 
  labs(title = 'ACF with Daily Returns' , y = 'Correlation') 

ggPacf(bit_ret_ts, lag.max = 200) + my_theme + 
  labs(title = 'PACF with Daily Returns', y = '')

# Plot First Difference after 2017
cut_bit_df = bitcoin %>%
  filter(Date > as.Date('2017-01-01'))

ggplotly(cut_bit_df[-1,] %>%
           mutate(WeightedPrice = diff(cut_bit_df$WeightedPrice)) %>%
           ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           my_theme + labs(x = '', title = 'Bitcoin Differenced By One', 
                           y = 'Difference'))
# Plot Daily Returns after 2017
cut_bit_ret_df = btc %>%
  filter(Date > as.Date('2017-01-01'))

ggplotly(cut_bit_ret_df[-1,] %>%
           mutate(WeightedPrice_return = diff(
             cut_bit_ret_df$WeightedPrice_return)) %>%
           ggplot(aes(Date, WeightedPrice_return)) + geom_line(col = '#ffa500') + 
           my_theme + labs(x = '', title = 'Bitcoin Daily Returns', 
                           y = '% Returns'))
# Box-Cox Normalization of Bitcoin Prices
BoxCox.lambda(bit_ts)
## [1] -0.03727455
ggplotly(cut_bit_df %>%
           mutate(WeightedPrice = BoxCox(cut_bit_df$WeightedPrice, 
                                         lambda=BoxCox.lambda(
                                           cut_bit_df$WeightedPrice))) %>%
           ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           my_theme + labs(x = '', title = 'Bitcoin Box-Cox transformed', 
                           y = 'BTC Price Transformed'))
# Plot First Difference, Transformed First Difference & Daily Returns
## Original Price
cut_bit_df[-1,] %>%
  mutate(WeightedPrice = diff(cut_bit_df$WeightedPrice)) %>%
  ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#650fba') + my_theme + 
  labs(x = '', title = 'Original BTC Price', y = 'Difference')

## Transformed Price
cut_bit_df[-1,] %>%
  mutate(WeightedPrice = diff(BoxCox(cut_bit_df$WeightedPrice, 
                                     lambda = BoxCox.lambda(
                                       cut_bit_df$WeightedPrice)))) %>%
  ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#650fba') + my_theme + 
  labs(x = '', title = 'Transformed BTC Price', y = '')

## Daily Returns of Price
cut_bit_ret_df[-1,] %>%
  ggplot(aes(Date, WeightedPrice_return)) + geom_line(col = '#650fba') + my_theme + 
  labs(x = '', title = 'Daily Returns of BTC Price', y = '')

# Autocorrelation (ACF) and Partial Autocorrelation (PACF) transformed Prices 
bit_ts_tran = BoxCox(bit_ts, lambda = BoxCox.lambda(bit_ts))

ggAcf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation') 

ggPacf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Test for Stationarity
# Daily Prices Dataset
adf.test(bit_ts) # p-value < 0.05 indicates the TS is stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  bit_ts
## Dickey-Fuller = -1.6865, Lag order = 11, p-value = 0.711
## alternative hypothesis: stationary
# Box-Cox Transformed Dataset
adf.test(bit_ts_tran) # p-value < 0.05 indicates the TS is stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  bit_ts_tran
## Dickey-Fuller = -2.1671, Lag order = 11, p-value = 0.5076
## alternative hypothesis: stationary
# Daily Returns Dataset
adf.test(bit_ret_ts) # p-value < 0.05 indicates the TS is stationary
## Warning in adf.test(bit_ret_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  bit_ret_ts
## Dickey-Fuller = -10.475, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
# Try an Autoarima with transformed data
arima_tr <- auto.arima(bit_ts_tran)

checkresiduals(arima_tr)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,1) with drift
## Q* = 8.1023, df = 5, p-value = 0.1507
## 
## Model df: 5.   Total lags used: 10
# Try an Autoarima with daily returns data
arima_ret <- auto.arima(bit_ret_ts)

checkresiduals(arima_ret)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with non-zero mean
## Q* = 10.711, df = 6, p-value = 0.09773
## 
## Model df: 4.   Total lags used: 10
# Taking only 2020
cut2_bit_df = cut_bit_df %>%
  filter(Date >= ymd('2020-01-01'))

ggplotly(cut2_bit_df %>%
           mutate(WeightedPrice = BoxCox(cut2_bit_df$WeightedPrice, 
                                         lambda = BoxCox.lambda(
                                           cut2_bit_df$WeightedPrice))) %>%
           ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)
ggplotly(cut2_bit_df[-1,] %>%
           mutate(WeightedPrice = diff(BoxCox(cut2_bit_df$WeightedPrice, 
                                      lambda = BoxCox.lambda(
                                        cut2_bit_df$WeightedPrice)))) %>%
           ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           my_theme + labs(x = '', title = 'Transformed Price', y = 'Difference'))
# ACF, PCF only for 2020
bit_ts2 = bitcoin %>%
  filter(Date >= as.Date('2020-01-01')) %>%
  arrange(Date) %>%
  select(WeightedPrice) %>%
  as.matrix() %>%
  ts()

bit_ts_tran2 = BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))

ggAcf(diff(bit_ts_tran2), lag.max = 200) + my_theme + labs(title = 'ACF' , 
                                                           y = 'Correlation') 

ggPacf(diff(bit_ts_tran2), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Autoarima for Data only 2020
arima_2020_tr <- auto.arima(bit_ts_tran2)

checkresiduals(arima_2020_tr)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1) with drift
## Q* = 14.894, df = 8, p-value = 0.06123
## 
## Model df: 2.   Total lags used: 10
# Taking only 2021
cut3_bit_df = cut_bit_df %>%
  filter(Date >= ymd('2021-01-01'))

ggplotly(cut3_bit_df %>%
           mutate(WeightedPrice = BoxCox(cut3_bit_df$WeightedPrice, 
                                         lambda = BoxCox.lambda(
                                           cut3_bit_df$WeightedPrice))) %>%
           ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)
ggplotly(cut3_bit_df[-1,] %>%
           mutate(WeightedPrice = diff(BoxCox(cut3_bit_df$WeightedPrice, 
                                              lambda = BoxCox.lambda(
                                                cut3_bit_df$WeightedPrice)))) %>%
           ggplot(aes(Date, WeightedPrice)) + geom_line(col = '#ffa500') + 
           my_theme + labs(x = '', title = 'Transformed Price', y = 'Difference'))
# ACF, PCF only for 2021
bit_ts3 = bitcoin %>%
  filter(Date >= as.Date('2021-01-01')) %>%
  arrange(Date) %>%
  select(WeightedPrice) %>%
  as.matrix() %>%
  ts()

bit_ts_tran3 = BoxCox(bit_ts3, lambda = BoxCox.lambda(bit_ts2))

ggAcf(diff(bit_ts_tran3), lag.max = 200) + my_theme + labs(title = 'ACF' , 
                                                           y = 'Correlation') 

ggPacf(diff(bit_ts_tran3), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Autoarima for Data only 2021
arima_2021_tr <- auto.arima(bit_ts_tran3)

checkresiduals(arima_2021_tr)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 13.409, df = 10, p-value = 0.2017
## 
## Model df: 0.   Total lags used: 10
# ARIMA Model Fits
## Data after 2017
bit_ts_past_2017 = bitcoin %>%
  filter(Date >= as.Date('2017-01-01')) %>%
  arrange(Date) %>%
  select(WeightedPrice) %>%
  as.matrix() %>%
  ts()

bit_ts_past_2017 %>%
  BoxCox(lambda = BoxCox.lambda(bit_ts_past_2017)) %>%
  Arima(order = c(0,1,0), include.drift = T) %>%
  checkresiduals()

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0) with drift
## Q* = 81.859, df = 9, p-value = 6.894e-14
## 
## Model df: 1.   Total lags used: 10
summary(Arima(bit_ts_tran, order = c(0,1,0), include.drift = T))
## Series: bit_ts_tran 
## ARIMA(0,1,0) with drift 
## 
## Coefficients:
##        drift
##       0.0016
## s.e.  0.0006
## 
## sigma^2 estimated as 0.000653:  log likelihood=3626.59
## AIC=-7249.18   AICc=-7249.17   BIC=-7238.41
## 
## Training set error measures:
##                        ME       RMSE        MAE           MPE      MAPE
## Training set 3.788572e-06 0.02553818 0.01755831 -1.606423e-05 0.2328859
##                   MASE      ACF1
## Training set 0.9947251 0.2064681
## Daily Return Data after 2017
bit_ts_ret_past_2017 = btc %>%
  filter(Date >= as.Date('2017-01-01')) %>%
  arrange(Date) %>%
  select(WeightedPrice_return) %>%
  as.matrix() %>%
  ts()

bit_ts_ret_past_2017 %>%
  Arima(order = c(0,1,0), include.drift = T) %>%
  checkresiduals()

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0) with drift
## Q* = 262.58, df = 9, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 10
summary(Arima(bit_ret_ts, order = c(0,1,0), include.drift = T))
## Series: bit_ret_ts 
## ARIMA(0,1,0) with drift 
## 
## Coefficients:
##        drift
##       0.0000
## s.e.  0.0011
## 
## sigma^2 estimated as 0.002042:  log likelihood=2707.03
## AIC=-5410.07   AICc=-5410.06   BIC=-5399.3
## 
## Training set error measures:
##                        ME       RMSE        MAE      MPE    MAPE      MASE
## Training set 2.741485e-08 0.04516154 0.03086297 85.18454 445.309 0.9993754
##                    ACF1
## Training set -0.3749845
## Random Walk Test on Daily Data only 2020
bit_ts_tran2 %>%
  Arima(order = c(0,1,0), include.drift = T) %>%
  checkresiduals()

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0) with drift
## Q* = 32.082, df = 9, p-value = 0.0001927
## 
## Model df: 1.   Total lags used: 10
summary(Arima(bit_ret_ts, order = c(0,1,0), include.drift = T))
## Series: bit_ret_ts 
## ARIMA(0,1,0) with drift 
## 
## Coefficients:
##        drift
##       0.0000
## s.e.  0.0011
## 
## sigma^2 estimated as 0.002042:  log likelihood=2707.03
## AIC=-5410.07   AICc=-5410.06   BIC=-5399.3
## 
## Training set error measures:
##                        ME       RMSE        MAE      MPE    MAPE      MASE
## Training set 2.741485e-08 0.04516154 0.03086297 85.18454 445.309 0.9993754
##                    ACF1
## Training set -0.3749845
## Random Walk Test on Daily Data only 2021
bit_ts_tran3 %>%
  Arima(order = c(0,1,0), include.drift = T) %>%
  checkresiduals()

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0) with drift
## Q* = 13.43, df = 9, p-value = 0.1441
## 
## Model df: 1.   Total lags used: 10
summary(Arima(bit_ret_ts, order = c(0,1,0), include.drift = T))
## Series: bit_ret_ts 
## ARIMA(0,1,0) with drift 
## 
## Coefficients:
##        drift
##       0.0000
## s.e.  0.0011
## 
## sigma^2 estimated as 0.002042:  log likelihood=2707.03
## AIC=-5410.07   AICc=-5410.06   BIC=-5399.3
## 
## Training set error measures:
##                        ME       RMSE        MAE      MPE    MAPE      MASE
## Training set 2.741485e-08 0.04516154 0.03086297 85.18454 445.309 0.9993754
##                    ACF1
## Training set -0.3749845
# Check errors
# Transformed Data
err_tr = residuals(Arima(bit_ts_tran, order = c(0,1,0), include.drift = T))
cat('Standard Deviation = ', sd(err_tr))
## Standard Deviation =  0.02554609
cat('Mean =', mean(err_tr))
## Mean = 3.788572e-06
invers_BoxCox = function(ts_data, lambda){
  original_ts = (ts_data * lambda + 1) ** (1/lambda)
  return(original_ts)
}

invers_BoxCox(sd(err_tr), BoxCox.lambda(bit_ts))
## [1] 1.025888
# Daily Return Data
err_ret = residuals(Arima(bit_ret_ts, order = c(0,1,0), include.drift = T))
cat('Standard Deviation = ', sd(err_ret))
## Standard Deviation =  0.04517554
cat('Mean =', mean(err_ret))
## Mean = 2.741485e-08
invers_BoxCox(sd(err_ret), BoxCox.lambda(bit_ret_ts))
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
## [1] 1.045235
# Forecast with ARIMA
## h is the the length you want the prediction to be in units of days

fit_model = function(bitcoin_data, h){
  bitcoin_df = bitcoin_data %>%
    filter(Date >= as.Date('2017-01-01')) %>%
    arrange(Date)
  
  time_series = bitcoin_df %>%
    select(WeightedPrice) %>%
    ts()
  
  predictions = time_series %>%
    BoxCox(lambda = BoxCox.lambda(time_series)) %>% 
    auto.arima() %>%
    forecast(h)
  
  forecast_df = cbind(data.frame(predictions[4]), 
                      data.frame(predictions[5]), 
                      data.frame(predictions[6]))
  
  the_forecast = invers_BoxCox(forecast_df, lambda = BoxCox.lambda(time_series))
  
  the_forecast = the_forecast %>%
    mutate(Date = tail(bitcoin_df$Date, h) + h) %>%
    as_tibble()
  
  return(the_forecast)
}



# read the updated data for BTC prices

bitcoin_new <- read_csv('btc_base_dataset_NEW.csv')
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   Day_of_Week = col_character()
## )
## i Use `spec()` for the full column specifications.
# Plot the new BTC Data
ggplotly(ggplot(bitcoin_new, aes(Date, WeightedPrice)) + 
           geom_line(col = '#ffa500') + 
           labs(title = 'Bitcoin Weighted Prices 2014 -2021 (new)', x = '') +
           scale_y_continuous(breaks = c(0, 5000, 10000, 15000, 30000, 60000), 
                              labels = c('$0', '$5,000', '$10,000', '$15,000', 
                                         '$30,000', '$60,000')) + my_theme)
## Predict the BTC Price for the next 30 Days

ggplotly(fit_model(bitcoin, 30) %>%
           ggplot(aes(x = Date, y = mean)) + geom_line(col = '#ff2500') +
           geom_ribbon(aes(ymin = lower.80., ymax = upper.80.), alpha = .3, 
                       fill = '#ffc04c') +
           geom_ribbon(aes(ymin = lower.95., ymax = upper.95.), alpha = .3, 
                       fill = '#ffe4b2') +
           geom_line(data = bitcoin_new, aes(Date, WeightedPrice)) +
           geom_line(data = filter(bitcoin, Date >= as.Date('2015-01-01')), 
                     aes(Date, WeightedPrice), col = '#ffa500') + my_theme +
           labs(title = 'Bitcoin Prediction of 30 Days', y = 'Price', x = '') +
           scale_y_continuous(breaks = c(0, 5000, 10000, 15000, 20000, 25000, 
                                         30000, 35000, 40000, 45000, 50000, 
                                         55000, 60000), 
                              labels = c('$0', '$5,000', '$10,000', '$15,000',
                                         '$20,000', '$25,000', '$30,000', 
                                         '$35,000', '$40,000', '$45,000', 
                                         '$50,000', '$55,000', '$60,000')))